Travel Trends from Mobi Data

This analysis follows Jake VanderPlas's blog post: http://jakevdp.github.io/blog/2015/07/23/learning-seattles-work-habits-from-bicycle-counts/



In [1]:

    
%matplotlib notebook



In [2]:

    
#Load mobi daily data
data = pd.read_pickle('taken_hourly_df.p')
#data = data['2017-06':'2017-09']
data = pd.DataFrame(data.sum(1))
data.loc['2017-06-24'] = np.nan
data.loc['2017-06-25'] = np.nan



In [3]:

    
data.tail()









    Out[3]:







  
    
      
      0
    
    
      time
      
    
  
  
    
      2017-11-03 12:00:00-07:00
      56.0
    
    
      2017-11-03 13:00:00-07:00
      74.0
    
    
      2017-11-03 14:00:00-07:00
      95.0
    
    
      2017-11-03 15:00:00-07:00
      69.0
    
    
      2017-11-03 16:00:00-07:00
      129.0



In [4]:

    
data['2017-06-29']









    Out[4]:







  
    
      
      0
    
    
      time
      
    
  
  
    
      2017-06-29 00:00:00-07:00
      18.0
    
    
      2017-06-29 01:00:00-07:00
      15.0
    
    
      2017-06-29 02:00:00-07:00
      3.0
    
    
      2017-06-29 03:00:00-07:00
      0.0
    
    
      2017-06-29 04:00:00-07:00
      3.0
    
    
      2017-06-29 05:00:00-07:00
      10.0
    
    
      2017-06-29 06:00:00-07:00
      34.0
    
    
      2017-06-29 07:00:00-07:00
      92.0
    
    
      2017-06-29 08:00:00-07:00
      201.0
    
    
      2017-06-29 09:00:00-07:00
      213.0
    
    
      2017-06-29 10:00:00-07:00
      118.0
    
    
      2017-06-29 11:00:00-07:00
      166.0
    
    
      2017-06-29 12:00:00-07:00
      144.0
    
    
      2017-06-29 13:00:00-07:00
      177.0
    
    
      2017-06-29 14:00:00-07:00
      204.0
    
    
      2017-06-29 15:00:00-07:00
      163.0
    
    
      2017-06-29 16:00:00-07:00
      238.0
    
    
      2017-06-29 17:00:00-07:00
      290.0
    
    
      2017-06-29 18:00:00-07:00
      278.0
    
    
      2017-06-29 19:00:00-07:00
      208.0
    
    
      2017-06-29 20:00:00-07:00
      156.0
    
    
      2017-06-29 21:00:00-07:00
      166.0
    
    
      2017-06-29 22:00:00-07:00
      109.0
    
    
      2017-06-29 23:00:00-07:00
      40.0



In [5]:

    
f,ax = plt.subplots()
data.loc['2017-06-28':'2017-07-02'].sum(1).plot()
ax.set_ylabel("Bike/hour")
ax.set_xlabel("")
#f.savefig('daydata2017-06-30.png')









    














    











    Out[5]:





<matplotlib.text.Text at 0x117c1e320>



In [6]:

    
f,ax = plt.subplots()
ax = data.sum(1).plot(kind='line')
ax.set_ylabel('Bike/hour')
ax.set_xlabel('')
f.savefig('hourly_usage_may-sep.png')



In [7]:

    
f,ax = plt.subplots()
data.groupby(pd.TimeGrouper(freq='D')).sum().sum(1).plot()









    














    











    Out[7]:





<matplotlib.axes._subplots.AxesSubplot at 0x117ea42b0>



In [8]:

    
def plotweek(startdate,enddate):
    weekdata = data[startdate:enddate]
    f,ax = plt.subplots()
    ax = weekdata.sum(1).plot()
    ax.set_ylabel('Bike/hour')
    ax.set_xlabel('')
    f.savefig('weekdata-{}-{}.png'.format(startdate,enddate))
    return ax



In [9]:

    
plotweek('2017-07-31','2017-08-06')









    














    











    Out[9]:





<matplotlib.axes._subplots.AxesSubplot at 0x117e0bb70>



In [10]:

    
plotweek('2017-08-07','2017-08-13')









    














    











    Out[10]:





<matplotlib.axes._subplots.AxesSubplot at 0x118226d68>

Transform data



In [11]:

    
pivoted = pd.DataFrame(data).pivot_table(
                           index=data.index.date,
                           columns=data.index.hour,
                           fill_value=0)



In [12]:

    
pivoted.head()









    Out[12]:







  
    
      
      0
    
    
      time
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      14
      15
      16
      17
      18
      19
      20
      21
      22
      23
    
  
  
    
      2017-04-17
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      66
      58
      47.0
      59
      44
      19
      20
      23
      8
      8
    
    
      2017-04-18
      6
      4
      6
      4
      2
      4
      17
      41
      51
      80
      ...
      57
      58
      106.0
      154
      71
      99
      43
      50
      18
      19
    
    
      2017-04-19
      9
      5
      2
      3
      4
      4
      15
      76
      117
      112
      ...
      68
      45
      98.0
      147
      129
      74
      70
      73
      32
      13
    
    
      2017-04-20
      7
      1
      0
      1
      1
      4
      19
      51
      127
      98
      ...
      97
      105
      136.0
      169
      174
      99
      92
      60
      33
      20
    
    
      2017-04-21
      8
      6
      3
      0
      2
      9
      17
      61
      123
      107
      ...
      126
      130
      175.0
      199
      210
      152
      90
      66
      49
      34
    
  

5 rows × 24 columns

Extract raw values into a numpy 2d array



In [14]:

    
X = pivoted.values
print(X.sum())
X.shape

Visualize the data



In [15]:

    
%matplotlib notebook



In [16]:

    
from sklearn.decomposition import PCA
Xpca = PCA(0.9).fit_transform(X)
Xpca.shape









    Out[16]:





(201, 3)



In [17]:

    
dates = list(pivoted.index.values)
dates = np.array([x.strftime("%m %d") for x in dates])
dates.shape









    Out[17]:





(201,)



In [18]:

    
total_trips = X.sum(1)
f,ax = plt.subplots()
scat = ax.scatter(Xpca[:, 0], Xpca[:, 1], c=total_trips, cmap='cool',picker=0.5)
ax.get_xaxis().set_ticks([])
ax.get_yaxis().set_ticks([])
f.colorbar(scat,label='total trips');
f.savefig('PCA_numtrips.png')



In [19]:

    
f2,ax2 = plt.subplots()

dayofweek = pd.to_datetime(pivoted.index).dayofweek
scat2 = ax2.scatter(Xpca[:, 0], Xpca[:, 1], c=dayofweek,
            cmap=plt.cm.get_cmap('jet', 7))
cb = f2.colorbar(scat2,ticks=range(7))
ax2.get_xaxis().set_ticks([])
ax2.get_yaxis().set_ticks([])
cb.set_ticklabels(['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])
#f2.clim(-0.5, 6.5);
f2.savefig('PCA_dayofweek.png')



In [26]:

    
f2,ax2 = plt.subplots()

dayofweek = pd.to_datetime(pivoted.index).dayofweek
scat2 = ax2.scatter(Xpca[:, 0], Xpca[:, 1], c=dayofweek,
            cmap=plt.cm.get_cmap('jet', 7))
cb = f2.colorbar(scat2,ticks=range(7))
ax2.get_xaxis().set_ticks([])
ax2.get_yaxis().set_ticks([])
cb.set_ticklabels(['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])
#f2.clim(-0.5, 6.5);

for date,x,y,dow in zip(dates,Xpca[:,0],Xpca[:,1],dayofweek):
    if dow > 4 and x>15:
        ax2.annotate(date,(x,y),xytext=(-20,-60),textcoords='offset points',arrowprops=dict(facecolor='black', headwidth=10, alpha=0.4, width=1))
f2.savefig('PCA_dayofweek_lowvolweekends.png')



In [27]:

    
f2,ax2 = plt.subplots()

dayofweek = pd.to_datetime(pivoted.index).dayofweek
scat2 = ax2.scatter(Xpca[:, 0], Xpca[:, 1], c=dayofweek,
            cmap=plt.cm.get_cmap('jet', 7))
cb = f2.colorbar(scat2,ticks=range(7))
ax2.get_xaxis().set_ticks([])
ax2.get_yaxis().set_ticks([])
cb.set_ticklabels(['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])
#f2.clim(-0.5, 6.5);

for date,x,y,dow in zip(dates,Xpca[:,0],Xpca[:,1],dayofweek):
    if dow == 0 and y>20:
        ax2.annotate(date,(x,y),xytext=(20,-60),textcoords='offset points',arrowprops=dict(facecolor='black', headwidth=10, alpha=0.4, width=1))
f2.savefig('PCA_dayofweek_mondays.png')



In [28]:

    
month = list(pivoted.index.values)
month = np.array([int(x.strftime("%m")) for x in month])
month;



In [29]:

    
f2,ax2 = plt.subplots()

dayofweek = pd.to_datetime(pivoted.index).dayofweek
scat2 = ax2.scatter(Xpca[:, 0], Xpca[:, 1], c=month,
            cmap=plt.cm.get_cmap('jet', 4))
cb = f2.colorbar(scat2,ticks=range(6,10))
ax2.get_xaxis().set_ticks([])
ax2.get_yaxis().set_ticks([])
cb.set_ticklabels(['Jun', 'Jul', 'Aug', 'Sep',''])
#f2.clim(-0.5, 6.5);



In [30]:

    
dayofweek.shape









    Out[30]:





(197,)



In [31]:

    
pivoted.index[-1]









    Out[31]:





datetime.date(2017, 10, 30)



In [32]:

    
pd.to_datetime(pivoted.index[-1]).dayofweek









    Out[32]:





0

Unsupervised Clustering



In [34]:

    
f3,ax3 = plt.subplots()

# Updates GaussianMixture from JVP's blog
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=2)
gmm.fit(Xpca)
cluster_label = gmm.predict(Xpca)
ax3.get_xaxis().set_ticks([])
ax3.get_yaxis().set_ticks([])
ax3.scatter(Xpca[:,0], Xpca[:,1],c=cluster_label)
f3.savefig('PCA_clustering.png')

To Do: Linear Discriminant Analysis



In [ ]:

	0
time
2017-11-03 12:00:00-07:00	56.0
2017-11-03 13:00:00-07:00	74.0
2017-11-03 14:00:00-07:00	95.0
2017-11-03 15:00:00-07:00	69.0
2017-11-03 16:00:00-07:00	129.0

	0
time
2017-06-29 00:00:00-07:00	18.0
2017-06-29 01:00:00-07:00	15.0
2017-06-29 02:00:00-07:00	3.0
2017-06-29 03:00:00-07:00	0.0
2017-06-29 04:00:00-07:00	3.0
2017-06-29 05:00:00-07:00	10.0
2017-06-29 06:00:00-07:00	34.0
2017-06-29 07:00:00-07:00	92.0
2017-06-29 08:00:00-07:00	201.0
2017-06-29 09:00:00-07:00	213.0
2017-06-29 10:00:00-07:00	118.0
2017-06-29 11:00:00-07:00	166.0
2017-06-29 12:00:00-07:00	144.0
2017-06-29 13:00:00-07:00	177.0
2017-06-29 14:00:00-07:00	204.0
2017-06-29 15:00:00-07:00	163.0
2017-06-29 16:00:00-07:00	238.0
2017-06-29 17:00:00-07:00	290.0
2017-06-29 18:00:00-07:00	278.0
2017-06-29 19:00:00-07:00	208.0
2017-06-29 20:00:00-07:00	156.0
2017-06-29 21:00:00-07:00	166.0
2017-06-29 22:00:00-07:00	109.0
2017-06-29 23:00:00-07:00	40.0

	0
time	0	1	2	3	4	5	6	7	8	9	...	14	15	16	17	18	19	20	21	22	23
2017-04-17	0	0	0	0	0	0	0	0	0	0	...	66	58	47.0	59	44	19	20	23	8	8
2017-04-18	6	4	6	4	2	4	17	41	51	80	...	57	58	106.0	154	71	99	43	50	18	19
2017-04-19	9	5	2	3	4	4	15	76	117	112	...	68	45	98.0	147	129	74	70	73	32	13
2017-04-20	7	1	0	1	1	4	19	51	127	98	...	97	105	136.0	169	174	99	92	60	33	20
2017-04-21	8	6	3	0	2	9	17	61	123	107	...	126	130	175.0	199	210	152	90	66	49	34